package com.webull.information.center.carwler.common.util.jsoup.prase_cn;

import java.io.IOException;
import java.text.ParseException;
import java.util.Date;
import java.util.Locale;
import java.util.Optional;

import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.webull.framework.util.UtilDate;
import com.webull.information.center.carwler.common.model.NewsInformation;
import com.webull.information.center.carwler.common.util.jsoup.HtmlBodyPrase;
import com.webull.information.center.carwler.common.util.jsoup.JsoupPraseUtil;
import com.webull.information.center.common.constants.Constants;

/**
 * HTML parser for Sohu (sohu.com) news article pages.
 * 
 * @author shimingjun
 * @date 2016-08-23 16:57:58
 * @version 1.0
 * @since JDK 1.8
 */

public class Sohu_HtmlPrase implements HtmlBodyPrase {
	protected final Logger logger = LogManager.getLogger(getClass());

	/**
	 * Extracts title, publish time, source name and article body from a Sohu
	 * article page and stores them on {@code info}.
	 * <p>
	 * Example page: http://money.sohu.com/20160301/n438999799.shtml
	 * <p>
	 * Parsing is best-effort: when the expected
	 * {@code #container div.content-wrapper} element is missing the method
	 * returns without touching {@code info}, and any exception raised while
	 * parsing is logged and not propagated.
	 *
	 * @param doc  parsed HTML document of the article page
	 * @param info target object that receives the extracted fields
	 */
	@Override
	public void praseNewsInfo(org.jsoup.nodes.Document doc, NewsInformation info) {
		try {
			Element container = doc.getElementById("container");
			Element body = container == null ? null : container.select("div.content-wrapper").first();
			if (body == null) {
				return;
			}
			// Default the language to Chinese only when the caller has not set one.
			if (StringUtils.isBlank(info.getLanguage())) {
				info.setLanguage(Constants.lang_zh);
			}
			fillTitle(body, info);
			fillPublishTime(body, info);
			fillSourceName(body, info);
			fillContent(body, info);
		} catch (Exception e) {
			// Pass the throwable separately so the stack trace is kept in the log.
			logger.warn("Failed to parse Sohu article page", e);
		}
	}

	/**
	 * Returns the stripped inner HTML of the first element matching
	 * {@code cssQuery} below {@code scope}, or {@code null} when nothing matches
	 * or the HTML is blank.
	 */
	private static String firstHtml(Element scope, String cssQuery) {
		Element hit = scope.select(cssQuery).first();
		return hit == null ? null : StringUtils.stripToNull(hit.html());
	}

	/** Sets the title from the schema.org headline, falling back to the first h1. */
	private void fillTitle(Element body, NewsInformation info) {
		String headline = firstHtml(body, "h1[itemprop=headline]");
		if (headline != null) {
			info.setTitle(headline);
		}
		if (StringUtils.isBlank(info.getTitle())) {
			String anyH1 = firstHtml(body, "h1");
			if (anyH1 != null) {
				info.setTitle(anyH1);
			}
		}
	}

	/**
	 * Sets the raw push time and the parsed news time. The ISO-like
	 * {@code datePublished} attribute is tried first; the legacy
	 * {@code div.time} text is used only when no news time has been set.
	 */
	private void fillPublishTime(Element body, NewsInformation info) {
		Element published = body.select("div[itemprop=datePublished]").first();
		String iso = published == null ? null : StringUtils.stripToNull(published.attr("content"));
		if (iso != null) {
			info.setPushTime(iso);
			Date parsed = parseIsoTimestamp(iso);
			if (parsed != null) {
				info.setNewsTime(parsed);
			}
		}

		if (info.getNewsTime() == null) {
			// Legacy layout: <div class="time">2013年06月05日23:54</div>
			String legacy = firstHtml(body, "div.time");
			if (legacy != null) {
				info.setPushTime(legacy);
				try {
					// The legacy format has no zone information; default to UTC+8.
					Date parsed = UtilDate.parse(legacy, Locale.CHINESE, 8, "yyyy年MM月dd日HH:mm");
					if (parsed != null) {
						info.setNewsTime(parsed);
					}
				} catch (Exception e) {
					// Best-effort: an unparseable time must not abort the rest of the page.
					logger.debug("Unparseable div.time value: " + legacy, e);
				}
			}
		}
	}

	/**
	 * Parses a value such as {@code 2016-03-01T14:19:07+08:00}.
	 * <p>
	 * The offset's hour field is read explicitly. Stripping every {@code 0}
	 * from the offset (the previous approach) turned {@code +10:00} into
	 * {@code 1}. Offset minutes are ignored because the {@code UtilDate.parse}
	 * overload used here takes a single integer offset.
	 *
	 * @return the parsed date, or {@code null} when the value cannot be parsed
	 */
	private Date parseIsoTimestamp(String iso) {
		try {
			String day = StringUtils.stripToEmpty(StringUtils.substringBefore(iso, "T"));
			String clock = StringUtils.substringAfter(iso, "T");
			int offsetHours = 0;
			// A '-' after the 'T' can only be the offset sign (HH:mm:ss has none).
			int signAt = Math.max(clock.lastIndexOf('+'), clock.lastIndexOf('-'));
			if (signAt >= 0) {
				String offset = clock.substring(signAt + 1);
				// Accept both "+08:00" and "+0800".
				String hours = offset.contains(":") ? StringUtils.substringBefore(offset, ":")
						: StringUtils.left(offset, 2);
				int magnitude = NumberUtils.toInt(hours, 0);
				// NOTE(review): assumes UtilDate.parse accepts negative hour offsets - confirm.
				offsetHours = clock.charAt(signAt) == '-' ? -magnitude : magnitude;
				clock = clock.substring(0, signAt);
			} else {
				// A trailing 'Z' means UTC, i.e. offset 0.
				clock = StringUtils.removeEnd(clock, "Z");
			}
			return UtilDate.parse(day + " " + clock, Integer.valueOf(offsetHours), "yyyy-MM-dd HH:mm:ss");
		} catch (Exception e) {
			// Best-effort: the caller falls back to the legacy div.time element.
			logger.debug("Unparseable datePublished value: " + iso, e);
			return null;
		}
	}

	/**
	 * Sets the source name from {@code #media_span}, preferring its nested
	 * {@code span[itemprop=name]} and otherwise using the span's own HTML.
	 */
	private void fillSourceName(Element body, NewsInformation info) {
		Element mediaSpan = body.getElementById("media_span");
		if (mediaSpan == null) {
			return;
		}
		Element named = mediaSpan.select("span[itemprop=name]").first();
		Element source = named != null ? named : mediaSpan;
		String name = StringUtils.stripToNull(source.html());
		if (name != null) {
			info.setSourceName(name);
		}
	}

	/**
	 * Sets the article content from {@code div[itemprop=articleBody]}. When the
	 * content is still blank afterwards, falls back to the older
	 * {@code #contentText} layout
	 * (e.g. http://stock.sohu.com/20130605/n378120690.shtml).
	 */
	private void fillContent(Element body, NewsInformation info) {
		Element article = body.select("div[itemprop=articleBody]").first();
		if (article != null) {
			Elements children = article.children();
			// Walk backwards so removals do not shift the indices still to visit.
			for (int i = children.size() - 1; i >= 0; i--) {
				Element child = children.get(i);
				String tagName = child.tagName();
				if ("P".equalsIgnoreCase(tagName)) {
					JsoupPraseUtil.trimParagraph(child);
				}
				if ("a".equalsIgnoreCase(tagName)) { // hyperlink
					JsoupPraseUtil.replaceWithText(child);
				}
				// Drop the "save to album" photo widgets; a no-op when none match.
				child.select("div.conserve-photo").remove();
				if (isDisposable(child, tagName)) {
					detach(child);
				}
			}
			info.setContent(StringUtils.stripToNull(article.html()));
		}

		if (StringUtils.isBlank(info.getContent())) {
			Element legacy = body.getElementById("contentText");
			if (legacy != null) {
				Elements children = legacy.children();
				for (int i = children.size() - 1; i >= 0; i--) {
					Element child = children.get(i);
					String tagName = child.tagName();
					// The legacy layout trims every child, not only <p>.
					JsoupPraseUtil.trimParagraph(child);
					if ("a".equalsIgnoreCase(tagName)) { // hyperlink
						JsoupPraseUtil.replaceWithText(child);
					}
					if (isDisposable(child, tagName)) {
						detach(child);
					}
				}
				info.setContent(StringUtils.stripToNull(legacy.html()));
			}
		}
	}

	/**
	 * Tells whether a child of the article body should be dropped: elements
	 * with class {@code muLink}, elements matching or containing
	 * {@code div.divstockguba}, script tags, and paragraphs without text.
	 */
	private static boolean isDisposable(Element child, String tagName) {
		return child.hasClass("muLink") || !child.select("div.divstockguba").isEmpty()
				|| "script".equalsIgnoreCase(tagName) || ("P".equalsIgnoreCase(tagName) && !child.hasText());
	}

	/**
	 * Removes {@code child} from its parent if it still has one. The node may
	 * already be detached (presumably by {@code JsoupPraseUtil.replaceWithText}),
	 * and removing a parentless node throws in some jsoup versions.
	 */
	private static void detach(Element child) {
		if (child.parentNode() != null) {
			child.remove();
		}
	}

	/**
	 * Manual smoke test: fetches one Sohu article over the network, parses it
	 * and prints the result.
	 */
	public static void main(String[] args) throws ParseException, IOException {
		// Other sample pages that exercise different layouts:
		// http://money.sohu.com/20160301/n438999799.shtml
		// http://business.sohu.com/20140325/n397172447.shtml
		// http://stock.sohu.com/20130605/n378120690.shtml
		String url2 = "http://sports.sohu.com/20160719/n459894649.shtml";
		Connection connection = Jsoup.connect(url2).userAgent(
				"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
				.header("x-client-data", RandomStringUtils.randomAlphanumeric(40));

		org.jsoup.nodes.Document doc = connection.timeout(10000).get();
		NewsInformation info = new NewsInformation();
		new Sohu_HtmlPrase().praseNewsInfo(doc, info);
		System.out.println(info);

	}
}
